library(mosaic)
library(tidyverse)
library(lubridate)
library(DataComputing)
library(rvest)
library(broom)
As COVID-19 spreads at an alarming rate, a pressing question emerges on a global scale: what factors of a country contribute to the spread of the coronavirus? We hope to analyze the relationship between population and the average spread of COVID-19 across the six inhabited continents.
# Read the raw CSV export, display it, and report how many rows were loaded.
COVID <- read.csv("total-covid-cases-deaths-per-million.csv")
COVID
nrow(COVID)
[1] 9487
# List the column names assigned to the raw import.
names(COVID)
[1] "total.covid.cases.deaths.per.million" "X" "X.1"
[4] "X.2" "X.3" "X.4"
[7] "X.5" "X.6" "X.7"
[10] "X.8" "X.9" "X.10"
[13] "X.11" "X.12" "X.13"
[16] "X.14" "X.15" "X.16"
[19] "X.17" "X.18" "X.19"
[22] "X.20" "X.21" "X.22"
[25] "X.23" "X.24" "X.25"
[28] "X.26" "X.27" "X.28"
[31] "X.29" "X.30" "X.31"
[34] "X.32" "X.33" "X.34"
[37] "X.35" "X.36" "X.37"
[40] "X.38" "X.39" "X.40"
[43] "X.41" "X.42" "X.43"
[46] "X.44" "X.45" "X.46"
[49] "X.47" "X.48" "X.49"
[52] "X.50" "X.51" "X.52"
[55] "X.53" "X.54" "X.55"
[58] "X.56" "X.57" "X.58"
[61] "X.59" "X.60" "X.61"
[64] "X.62" "X.63" "X.64"
[67] "X.65" "X.66" "X.67"
[70] "X.68" "X.69" "X.70"
[73] "X.71" "X.72" "X.73"
[76] "X.74" "X.75" "X.76"
[79] "X.77" "X.78" "X.79"
[82] "X.80" "X.81" "X.82"
[85] "X.83" "X.84" "X.85"
[88] "X.86" "X.87" "X.88"
[91] "X.89" "X.90" "X.91"
[94] "X.92" "X.93" "X.94"
[97] "X.95" "X.96" "X.97"
[100] "X.98" "X.99" "X.100"
[103] "X.101" "X.102" "X.103"
[106] "X.104" "X.105" "X.106"
[109] "X.107" "X.108" "X.109"
[112] "X.110" "X.111" "X.112"
[115] "X.113" "X.114" "X.115"
[118] "X.116" "X.117" "X.118"
[121] "X.119" "X.120" "X.121"
[124] "X.122" "X.123" "X.124"
[127] "X.125" "X.126" "X.127"
[130] "X.128" "X.129" "X.130"
[133] "X.131" "X.132" "X.133"
[136] "X.134" "X.135" "X.136"
[139] "X.137" "X.138" "X.139"
[142] "X.140" "X.141" "X.142"
[145] "X.143" "X.144" "X.145"
[148] "X.146" "X.147" "X.148"
[151] "X.149" "X.150" "X.151"
[154] "X.152" "X.153" "X.154"
[157] "X.155" "X.156" "X.157"
[160] "X.158" "X.159" "X.160"
[163] "X.161" "X.162" "X.163"
[166] "X.164" "X.165" "X.166"
[169] "X.167" "X.168" "X.169"
[172] "X.170" "X.171" "X.172"
[175] "X.173" "X.174" "X.175"
[178] "X.176" "X.177" "X.178"
[181] "X.179" "X.180" "X.181"
[184] "X.182" "X.183" "X.184"
[187] "X.185" "X.186" "X.187"
[190] "X.188" "X.189" "X.190"
[193] "X.191" "X.192" "X.193"
[196] "X.194" "X.195" "X.196"
[199] "X.197" "X.198" "X.199"
[202] "X.200" "X.201" "X.202"
[205] "X.203" "X.204" "X.205"
[208] "X.206" "X.207" "X.208"
[211] "X.209" "X.210" "X.211"
[214] "X.212" "X.213" "X.214"
[217] "X.215" "X.216" "X.217"
[220] "X.218" "X.219" "X.220"
[223] "X.221" "X.222" "X.223"
[226] "X.224" "X.225" "X.226"
[229] "X.227" "X.228" "X.229"
[232] "X.230" "X.231" "X.232"
[235] "X.233" "X.234" "X.235"
[238] "X.236" "X.237" "X.238"
[241] "X.239" "X.240" "X.241"
[244] "X.242" "X.243" "X.244"
[247] "X.245" "X.246" "X.247"
[250] "X.248" "X.249" "X.250"
[253] "X.251" "X.252" "X.253"
[256] "X.254"
# Preview the first rows of the raw COVID table.
head(COVID)
# CountryData is not created in this document; presumably it ships with one of
# the loaded packages -- TODO confirm which.
CountryData
nrow(CountryData)
[1] 256
# List the attributes available for each country.
names(CountryData)
[1] "country" "area" "pop" "growth" "birth" "death"
[7] "migr" "maternal" "infant" "life" "fert" "health"
[13] "HIVrate" "HIVpeople" "HIVdeath" "obesity" "underweight" "educ"
[19] "unemploymentYouth" "GDP" "GDPgrowth" "GDPcapita" "saving" "indProd"
[25] "labor" "unemployment" "family" "tax" "budget" "debt"
[31] "inflation" "discount" "lending" "narrow" "broad" "credit"
[37] "shares" "balance" "exports" "imports" "gold" "externalDebt"
[43] "homeStock" "abroadStock" "elecProd" "elecCons" "elecExp" "elecImp"
[49] "elecCap" "elecFossil" "elecNuc" "elecHydro" "elecRenew" "oilProd"
[55] "oilExp" "oilImp" "oilRes" "petroProd" "petroCons" "petroExp"
[61] "petroImp" "gasProd" "gasCons" "gasExp" "gasImp" "gasRes"
[67] "mainlines" "cell" "netHosts" "netUsers" "airports" "railways"
[73] "roadways" "waterways" "marine" "military"
# Preview the first rows of CountryData.
head(CountryData)
# countryRegions is not created in this document; presumably it ships with one
# of the loaded packages -- TODO confirm which.
countryRegions
nrow(countryRegions)
[1] 254
# List the classification columns available in countryRegions.
names(countryRegions)
[1] "ISO3" "ADMIN" "REGION" "continent" "GEO3major" "GEO3" "IMAGE24" "GLOCAF"
[9] "Stern" "SRESmajor" "SRES" "GBD" "AVOIDnumeric" "AVOIDname" "LDC" "SID"
[17] "LLDC"
# Preview the first rows of countryRegions, then show the raw COVID table again
# before it is tidied.
head(countryRegions)
COVID
Since our analysis is focused on the spread of COVID-19, we select only columns which pertain to the number of COVID-19 cases in countries over time.
# Build a tidy table with one row per country and date, keeping only the
# country name, country code, date, and cases-per-million columns.
TidyCOVID <- COVID %>%
  rename(
    country = total.covid.cases.deaths.per.million,
    Code = X,
    date = X.1,
    casesPerMillion = X.3
  ) %>%
  # Drop the first imported row (presumably the CSV's real header line, since
  # the import produced placeholder names X, X.1, ... -- TODO confirm).
  filter(row_number() > 1) %>%
  subset(select = c(1, 2, 3, 5)) %>%
  mutate(
    country = as.character(country),
    date = mdy(as.character(date)),
    # Parse the recorded numbers through character. Calling as.integer()
    # directly on a factor column returns the internal level codes rather than
    # the values, and would also truncate any decimals.
    casesPerMillion = as.numeric(as.character(casesPerMillion)),
    # Blank entries parse to NA; count them as 0 so later sums and the
    # `cases != 0` filter keep working.
    # NOTE(review): confirm that a blank means "no reported cases".
    casesPerMillion = replace_na(casesPerMillion, 0)
  )
TidyCOVID
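Each instance (row) of TidyCOVID represents one country on one date, recording that country's total confirmed COVID-19 cases per million people as of that date.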
We will extract the ISO3 country code and continent from the countryRegions data. Since naming conventions for countries vary, the ISO3 country code gives us a standardized identifier for each country with which to join other data tables.
# Lookup table from ISO3 country code to continent; the REGION column of
# countryRegions supplies the continent label.
Labels <- countryRegions[, c("ISO3", "REGION")] %>%
  rename(continent = REGION)
Labels
We will select the aspects of CountryData relevant to our analysis. These attributes are: area (sq km) and pop (number of people).
# Keep only the first three columns of CountryData (country, area, pop).
RelevantCountryData <- subset(CountryData, select = 1:3)
RelevantCountryData
Calculate the number of cases in each country by multiplying casesPerMillion by population (in millions). This variable is now a standardized metric with which we can compare countries.
# Join each country's dated COVID records to its area and population, convert
# cases per million into an absolute case count, and attach the continent.
COVIDGrowth <-
  inner_join(TidyCOVID, RelevantCountryData, by = "country") %>%
  # Scale by the exact population in millions. Rounding pop / 1e6 to a whole
  # number first would give every country under 500,000 people zero cases and
  # distort the counts of other small countries.
  mutate(cases = casesPerMillion * pop / 1000000) %>%
  # Country names vary between sources, so the continent label is matched on
  # the ISO3 code instead.
  left_join(Labels, by = c("Code" = "ISO3"))
Column `Code`/`ISO3` joining factor and character vector, coercing into character vector
# Display the joined table, which now carries case counts and continent labels.
COVIDGrowth
This table records the first date on which each country recorded a nonzero number of COVID-19 cases. It will help us visualize *when* countries first became infected.
# For each country (kept together with its continent), find the earliest date
# on which the case count is nonzero.
FirstInstance <- COVIDGrowth %>%
  filter(cases != 0) %>%
  group_by(country, continent) %>%
  summarise(beginningofspread = min(date))
FirstInstance
This table gives the average increase in cases per day for each country, from the first day the country had COVID-19 to the most recent date in the data table (April 5, 2020).
# Average daily spread per country: the case count on 2020-04-05 divided by the
# number of days since the country's first nonzero case count.
DailySpread <-
  left_join(COVIDGrowth, FirstInstance, by = "country") %>%
  filter(date == as.Date("2020-04-05")) %>%
  mutate(
    # Floor the elapsed time at one day so a country whose first case falls on
    # the final date gets a finite rate instead of dividing by zero (Inf).
    dayselapsed = pmax(as.numeric(date - beginningofspread), 1),
    dailyspread = cases / dayselapsed
  ) %>%
  subset(select = c("country", "beginningofspread", "dailyspread"))
# Countries with no nonzero case count have no start date, so their rate is NA;
# record it as zero spread.
DailySpread$dailyspread[is.na(DailySpread$dailyspread)] <- 0
DailySpread
# Attach each country's first-case date and average daily spread to every one
# of its dated rows.
COVIDFinal <- COVIDGrowth %>%
  left_join(DailySpread, by = "country")
COVIDFinal
# Plot the total case count across all countries for each date.
COVIDFinal %>%
  group_by(date) %>%
  # na.rm = TRUE: a single missing case count would otherwise turn the whole
  # date's total into NA and that point would be dropped from the plot.
  summarise(totalcases = sum(cases, na.rm = TRUE)) %>%
  ggplot(aes(x = date, y = totalcases)) +
  geom_point()
# Plot the total case count per date, one panel per continent. Rows with any
# missing value are discarded first.
COVIDFinal %>%
  na.omit() %>%
  group_by(date, continent) %>%
  summarise(totalcases = sum(cases)) %>%
  ggplot(aes(x = date, y = totalcases)) +
  geom_point() +
  facet_wrap(~continent)
# Bar chart of the 20 countries with the highest average daily spread, ordered
# from highest to lowest.
COVIDFinal %>%
  group_by(country) %>%
  summarise(dailyspread = mean(dailyspread)) %>%
  arrange(desc(dailyspread)) %>%
  head(20) %>%
  ggplot(aes(x = reorder(country, desc(dailyspread)), y = dailyspread)) +
  geom_col(position = "stack", width = 0.9) +
  # Print axis values in full rather than in scientific notation.
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  labs(x = "Country", y = "Spread of COVID-19 cases Per Day") +
  # Tilt the country names so they do not overlap.
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
# Scatter plot of population against average daily spread, coloured by
# continent. Rows with any missing value are discarded first.
COVIDFinal %>%
  na.omit() %>%
  ggplot(aes(x = pop, y = dailyspread, color = continent)) +
  geom_point()
Even without the outliers, the plot still shows a strong positive correlation.
# Same scatter plot as above, restricted to populations up to 500 million and
# daily spread up to 40,000 so the outliers are excluded.
na.omit(COVIDFinal) %>%
  ggplot(aes(x = pop, y = dailyspread, color = continent)) +
  geom_point() +
  xlim(0, 500000000) +
  # ylim() must be chained with `+`; as a separate statement it is never
  # applied to the plot and only prints a scale object.
  ylim(0, 40000)
<ScaleContinuousPosition>
Range:
Limits: 0 -- 4e+04
# Stacked dot plot of the date each country recorded its first case, one dot
# per country, filled by continent.
FirstInstance %>%
  na.omit() %>%
  ggplot(aes(x = beginningofspread, fill = continent)) +
  geom_dotplot(stackgroups = TRUE, binwidth = 1, binpositions = "all") +
  labs(x = "Country's First Case of COVID-19") +
  # Hide the y axis and the panel background.
  theme(
    panel.background = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.y = element_blank()
  )